Question 1

Recreate the animated gapminder plot for the full set of years.

First, use the following to read the three data sets into R:

# All three Gapminder CSV files share the same layout (one row per country,
# one column per year), so read them with a single helper.
# check.names = FALSE keeps the year columns as plain "1800", "1801", ...
# instead of the syntactic names "X1800", "X1801", ...
read_gapminder_csv <- function(path) {
  read.csv(
    path,
    header = TRUE,
    stringsAsFactors = FALSE,
    check.names = FALSE
  )
}

gdp_per_cap <- read_gapminder_csv(
  "income_per_person_gdppercapita_ppp_inflation_adjusted.csv"
)
life_exp <- read_gapminder_csv("life_expectancy_years.csv")
pop <- read_gapminder_csv("population_total.csv")

Create a single tidy dataframe that includes all the data you need to recreate the plot for all the years up to and including 2020. The structure of your data should be similar to that of the gapminder dataset that is provided in the gapminder package:

# Reference: the target structure from the gapminder package
# (country, continent, year, lifeExp, pop, gdpPercap)
data(gapminder, package = "gapminder")
head(gapminder)
##       country continent year lifeExp      pop gdpPercap
## 1 Afghanistan      Asia 1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia 1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia 1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia 1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia 1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia 1977  38.438 14880372  786.1134
library(tidyr)
library(dplyr)
library(countrycode)

# Reshape each wide table (one row per country, one column per year) into
# long format: one row per country/year observation.
gdp_long <- pivot_longer(gdp_per_cap, cols = -country, names_to = "year", values_to = "gdp_per_cap")
life_exp_long <- pivot_longer(life_exp, cols = -country, names_to = "year", values_to = "life_expectancy")
pop_long <- pivot_longer(pop, cols = -country, names_to = "year", values_to = "population")

# Merge the three measures; inner joins keep only country/year pairs that
# are present in all three tables.
data_combined <- gdp_long %>%
  inner_join(life_exp_long, by = c("country", "year")) %>%
  inner_join(pop_long, by = c("country", "year"))

# The years arrive as character column headers; make them numeric.
data_combined$year <- as.numeric(data_combined$year)

# Keep the human-readable country names (as the gapminder dataset does)
# and ADD the continent column that the gapminder layout requires, instead
# of overwriting country with iso3c codes. countrycode() warns and returns
# NA for any name it cannot match -- inspect those warnings.
data_combined$country <- as.character(data_combined$country)
data_combined$continent <-
  countrycode(data_combined$country, origin = "country.name", destination = "continent")

# Restrict to the requested period (up to and including 2020).
data_final <- filter(data_combined, year <= 2020)

print(data_final)
## # A tibble: 41,327 × 5
##    country  year gdp_per_cap life_expectancy population
##    <chr>   <dbl>       <int>           <dbl>      <int>
##  1 AFG      1800         603            28.2    3280000
##  2 AFG      1801         603            28.2    3280000
##  3 AFG      1802         603            28.2    3280000
##  4 AFG      1803         603            28.2    3280000
##  5 AFG      1804         603            28.2    3280000
##  6 AFG      1805         603            28.2    3280000
##  7 AFG      1806         603            28.1    3280000
##  8 AFG      1807         603            28.1    3280000
##  9 AFG      1808         603            28.1    3280000
## 10 AFG      1809         603            28.1    3280000
## # ℹ 41,317 more rows

Use the code from the introductory presentation to generate the animated plot for the entire period.

library(ggplot2)
library(plotly)

# Animated bubble chart: GDP per capita (log scale) vs life expectancy,
# one frame per year; `ids = country` keeps each point's identity stable
# across frames so plotly can tween it smoothly.
p <- ggplot(data_final, aes(gdp_per_cap, life_expectancy)) +
  geom_point(aes(size = population, frame = year, ids = country)) +
  scale_x_log10() +
  theme(legend.title = element_blank())

ggplotly(p)

Question 2

Compile a single tidy data frame from the two tables table1.xlsx and table2.xlsx for some monthly historical data downloaded from the U.S. Bureau of Labor Statistics (and slightly modified, somewhat nefariously, by the instructor). The data are provided as Excel sheets. You may use Excel to delete the first 11 lines of the sheets, but do not manipulate the data in any other way with Excel. Read the files into R using the read_xlsx() function from the readxl package.

library(readxl)
library(tidyverse)

# Read the two BLS sheets; the "_11" file names presumably reflect the
# first 11 header rows having been deleted in Excel, per the assignment
# instructions -- TODO confirm.
table1 <- read_xlsx("table1_11.xlsx")  # monthly CPI values, one row per year
table2 <- read_xlsx("table2_11.xlsx")  # monthly unemployment rates, one row per year

head(table1)
## # A tibble: 6 × 14
##    Year   Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  1948  23.7  23.7  23.5  23.8  24.0  24.2  24.4  24.4  24.4  24.3  24.2  24.0
## 2  1949  24.0  23.9  23.9  23.9  23.9  23.9  23.7  23.7  23.8  23.7  23.7  23.6
## 3  1950  23.5  23.6  23.6  23.6  23.8  23.9  24.1  24.2  24.3  24.5  24.6  25.0
## 4  1951  25.4  25.8  25.9  25.9  26.0  25.9  25.9  25.9  26.0  26.2  26.3  26.5
## 5  1952  26.4  26.4  26.4  26.5  26.5  26.5  26.7  26.7  26.6  26.7  26.7  26.7
## 6  1953  26.6  26.6  26.6  26.7  26.7  26.8  26.8  26.8  26.9  27.0  26.8  26.9
## # ℹ 1 more variable: `HALF1/HALF2` <chr>
head(table2)
## # A tibble: 6 × 14
##    Year   Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1  1948   3.4   3.8   4     3.9   3.5   3.6   3.6   3.9   3.8   3.7   3.8   4  
## 2  1949   4.3   4.7   5     5.3   6.1   6.2   6.7   6.8   6.6   7.9   6.4   6.6
## 3  1950   6.5   6.4   6.3   5.8   5.5   5.4   5     4.5   4.4   4.2   4.2   4.3
## 4  1951   3.7   3.4   3.4   3.1   3     3.2   3.1   3.1   3.3   3.5   3.5   3.1
## 5  1952   3.2   3.1   2.9   2.9   3     3     3.2   3.4   3.1   3     2.8   2.7
## 6  1953   2.9   2.6   2.6   2.7   2.5   2.5   2.6   2.7  NA     3.1   3.5   4.5
## # ℹ 1 more variable: Annual <dbl>

Use pivot_longer, pivot_wider and join operations wisely.

# Pivot both tables to long format: one row per Year/Month observation.
# values_drop_na strips cells that were NA in the wide sheets.
table1_long <- table1 %>%
  select(-`HALF1/HALF2`) %>%  # half-year summary; recomputable from monthly data
  pivot_longer(Jan:Dec, names_to = "Month", values_to = "CPI", values_drop_na = TRUE)

table2_long <- table2 %>%
  select(-Annual) %>%  # annual summary; recomputable from monthly data
  pivot_longer(Jan:Dec, names_to = "Month", values_to = "Unemployment Rate", values_drop_na = TRUE)

# Left join keeps every CPI month; unemployment comes out NA for months
# that table2 was missing.
combined_data <- left_join(table1_long, table2_long, by = c("Year", "Month"))

head(combined_data)
## # A tibble: 6 × 4
##    Year Month   CPI `Unemployment Rate`
##   <dbl> <chr> <dbl>               <dbl>
## 1  1948 Jan    23.7                 3.4
## 2  1948 Feb    23.7                 3.8
## 3  1948 Mar    23.5                 4  
## 4  1948 Apr    23.8                 3.9
## 5  1948 May    24.0                 3.5
## 6  1948 Jun    24.2                 3.6

Which variables will you include? Which will you not? why?

I will include the Year and Month variables, together with the monthly CPI and unemployment-rate values. These columns are the core data needed for most analyses, representing the monthly data points across years for two different metrics. I will not include the HALF1/HALF2 and Annual columns. These columns represent summaries or aggregates that can be recalculated from the monthly data if needed. Including them in the tidy data may lead to redundancy and potentially complicate analyses that focus on monthly trends.

How will you deal with NA values?

I will use the values_drop_na = TRUE argument in the pivot_longer function to remove NA values from each long-format table. Note, however, that because the two tables are then combined with a left join keyed on table1, a month that is present in table1 but was dropped from table2 (e.g., September 1953) reappears in the final table with an NA unemployment rate — only the CPI column is guaranteed to be free of NA values.

Comment about any peculiarities that you identify in the data.

  1. Presence of NA Values: table2 shows an NA value for September 1953, which indicates missing data.

  2. Zero Values in 1961: table2 shows zero values for the entire year of 1961, which seems unusual and could indicate data entry errors or an anomaly in the data collection process.

  3. Negative Values in 1977: A negative value (-7.2) appears for April 1977 in table2, which is likely a data entry error, as negative unemployment rates are not feasible.

  4. High Unemployment Rate Spike in 2020: There’s a significant spike in the unemployment rate in April 2020 in table2, jumping to 14.7%. This outlier may reflect the impact of the COVID-19 pandemic on employment.

Which sanity checks can you devise to verify that your tidy table is correct (an important yet not exhaustive hint: group_by)? What do you conclude from these sanity checks?

  1. Check for completeness of months and years, so that we can ensure every year has all months represented, accounting for any expected missing values
# Sanity check 1: every year present should contribute exactly 12 distinct
# months; an empty result means no year violates this. (Note: this cannot
# flag a year that is missing from the data entirely.)
completeness_check <- combined_data %>%
  group_by(Year) %>%
  summarise(Months_count = length(unique(Month))) %>%
  filter(Months_count != 12)
print(completeness_check)
## # A tibble: 0 × 2
## # ℹ 2 variables: Year <dbl>, Months_count <int>

This result is expected and indicates that the data cleaning and merging processes were successful in maintaining a complete set of monthly records for each year.

  2. Verify range of CPI and unemployment rate, ensuring they fall within reasonable bounds.
# Sanity check 2: overall value ranges; range() returns c(min, max) in one
# pass over each column.
cpi_range <- range(combined_data$CPI, na.rm = TRUE)
unemp_range <- range(combined_data$`Unemployment Rate`, na.rm = TRUE)
summary_check <- tibble(
  Min_CPI = cpi_range[1],
  Max_CPI = cpi_range[2],
  Min_Unemployment = unemp_range[1],
  Max_Unemployment = unemp_range[2]
)
print(summary_check)
## # A tibble: 1 × 4
##   Min_CPI Max_CPI Min_Unemployment Max_Unemployment
##     <dbl>   <dbl>            <dbl>            <dbl>
## 1    21.5    280.             -7.2             14.7

A minimum CPI of 21.5 and a maximum of 280, which suggests a wide range of Consumer Price Index values. The minimum value seems reasonable for historical data, assuming the dataset starts from a period when CPI values were generally lower. The maximum value suggests the dataset extends into more recent years when inflation has significantly increased CPI values.

The minimum unemployment rate is reported as -7.2, and the maximum is 14.7. The maximum unemployment rate is plausible, especially considering economic crises that could lead to higher unemployment rates. However, the minimum value of -7.2 is not feasible, as unemployment rates cannot be negative.

  3. Check for negative or zero values in CPI and unemployment rate, which may not be feasible.
# Sanity check 3: neither a price index nor an unemployment rate can be
# zero or negative, so any row matched here is a data error.
anomaly_check <- filter(combined_data, CPI <= 0 | `Unemployment Rate` <= 0)
print(anomaly_check)
## # A tibble: 13 × 4
##     Year Month   CPI `Unemployment Rate`
##    <dbl> <chr> <dbl>               <dbl>
##  1  1961 Jan    29.8                 0  
##  2  1961 Feb    29.8                 0  
##  3  1961 Mar    29.8                 0  
##  4  1961 Apr    29.8                 0  
##  5  1961 May    29.8                 0  
##  6  1961 Jun    29.8                 0  
##  7  1961 Jul    29.9                 0  
##  8  1961 Aug    29.9                 0  
##  9  1961 Sep    30.0                 0  
## 10  1961 Oct    30.0                 0  
## 11  1961 Nov    30.0                 0  
## 12  1961 Dec    30.0                 0  
## 13  1977 Apr    60                  -7.2

Unemployment rates of 0 for all months of 1961 are highly unlikely and suggest a data reporting error. The negative unemployment rate for April 1977 is also an error, as mentioned earlier.

Do not type manually years or months. Find a way to create vectors that include these values from the raw data (consider rownames, colnames).

# Years come straight from the data; deliberately NOT sorted, so the
# out-of-order 1947 entry stays visible in the echo below (note also that
# 1984 is absent entirely -- another planted peculiarity).
years_vector <- unique(table1$Year)
# Select the month columns by NAME rather than by hard-coded positions
# [2:13], so a reordered or extended sheet cannot silently break this.
# month.abb is base R's built-in "Jan".."Dec" vector, and intersect()
# preserves month.abb's calendar order.
months_vector <- intersect(month.abb, colnames(table1))

years_vector
##  [1] 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
## [16] 1963 1964 1965 1966 1967 1947 1968 1969 1970 1971 1972 1973 1974 1975 1976
## [31] 1977 1978 1979 1980 1981 1982 1983 1985 1986 1987 1988 1989 1990 1991 1992
## [46] 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
## [61] 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021

months_vector
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"

Question 3

Read Cleveland’s Chapter 2, Sections 2.1-2.2.

The data referred to in these sections can be found in the attached file Cleveland_singer.rds that can be read into R using readRDS:

singer <- readRDS("Cleveland_singer.rds")

Please answer the following:

Question 4

Tidy the anscombe dataset.

Step 1: Combine the four datasets into a single table and add a column for the dataset identifier.

library(tidyverse)

# Load the built-in Anscombe quartet: wide layout with paired columns
# x1..x4 / y1..y4, one (x, y) column pair per dataset.
data("anscombe")
head(anscombe)
##   x1 x2 x3 x4   y1   y2    y3   y4
## 1 10 10 10  8 8.04 9.14  7.46 6.58
## 2  8  8  8  8 6.95 8.14  6.77 5.76
## 3 13 13 13  8 7.58 8.74 12.74 7.71
## 4  9  9  9  8 8.81 8.77  7.11 8.84
## 5 11 11 11  8 8.33 9.26  7.81 8.47
## 6 14 14 14  8 9.96 8.10  8.84 7.04
# Reshape wide -> long: each column name "x1".."y4" is split by
# names_pattern into a value column (".value" = x or y) and a dataset id
# ("1".."4"), which is then relabeled with Roman numerals via the base
# utility as.roman() to match the conventional quartet naming (I..IV).
anscombe_long <- anscombe %>%
  pivot_longer(
    cols = everything(),
    names_to = c(".value", "dataset"),
    names_pattern = "(.)(.)"
  ) %>%
  mutate(dataset = as.character(as.roman(dataset)))

Step 2: Convert the dataset identifiers to categorical factors for easier analysis.

# Treat the dataset id as categorical; the default alphabetical level
# order happens to coincide with I < II < III < IV.
anscombe_long$dataset <- as.factor(anscombe_long$dataset)

Step 3: Ensure data types are consistent: x and y as numeric and Dataset as categorical.

# Explicit coercion of the measurement columns; x and y are already
# numeric after the pivot, so this is a no-op safeguard.
anscombe_long$x <- as.numeric(anscombe_long$x)
anscombe_long$y <- as.numeric(anscombe_long$y)
head(anscombe_long)
## # A tibble: 6 × 3
##   dataset     x     y
##   <fct>   <dbl> <dbl>
## 1 I          10  8.04
## 2 II         10  9.14
## 3 III        10  7.46
## 4 IV          8  6.58
## 5 I           8  6.95
## 6 II          8  8.14

Step 4: Check any missing or duplicate data points.

# Final sanity checks: the tidy table should contain no missing values and
# no duplicated rows.
anscombe_long %>% is.na() %>% sum()
## [1] 0
anscombe_long %>% duplicated() %>% sum()
## [1] 0